home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
-
- /* The memory hashtables for building an index. */
- /* -brewster 5/90 */
-
- /* Change log:
- * $Log: irhash.c,v $
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.16 92/03/20 11:04:33 jonathan
- * Added code to handle switches for word_pairs and word_postition info.
- *
- * Revision 1.15 92/02/12 13:26:28 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /* main functions:
- * add_word
- * finished_add_word
- * look_up_word
- *
- * The idea is to store up a bunch of words before going to disk.
- * A word entry points to where it will go on disk, and
- * accumulates the entries before doing it.
- *
- * Some of the policy issues in this file are:
- * How much weight should the first occurance of a word in a document get
- * over the other occurances. The first occurance should be worth more
- * so that words with 3 occurances of "dog" and not "cat"'s should not
- * win out over 1 "dog" and 1 "cat" if the question is "Tell me about cats
- * torture dogs"
- * The extra weight is 5 at this point.
- *
- */
-
- /* To Do:
- * done: Improve the hashing functions.
- * done: stop inserting into hash table after max number have been accumulated
- * done: make flush not flush buffers that are too big.
- */
-
- #include <ctype.h>
- #include <string.h> /* for strlen(), memset() */
-
- #include "panic.h"
- #include "cutil.h"
- #include "irfiles.h"
- #include "irhash.h"
- #include "stoplist.h"
- #include "irinv.h"
-
- #ifdef UNIX
- #define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */
- #else
- #define PRINT_AS_INDEXING false
- #endif
-
- /* ================================
- === Word Occurance Buffers ===
- ================================ */
-
- /* Word occurance buffers
- * This is a simple memory allocator for use with the word memory hashtable.
- * Since the buffers are tiny, this is done as a copy-sweep GC scheme.
- * Oh, I long for the storage system of lisp.
- */
- unsigned char *first_word_occurance_buffer = NULL; /* allocate blocks out of this */
- unsigned char *last_word_occurance_buffer = NULL;
- long word_occurance_block_length = 256000; /* maybe this should be larger? */
- unsigned char * word_occurance_free_ptr = NULL;
-
- unsigned char *make_word_occurrance_block(size)
- long size;
-
- {
- /* allocates a word_occurance_block out of the buffers */
- /* old way: s_malloc((size_t)size); */
- /* returns a pointer to a piece of memory */
- if(NULL == first_word_occurance_buffer){
- /* initialize it */
- #ifdef WIN32
- first_word_occurance_buffer =
- (unsigned char *)s_malloc(MAX(word_occurance_block_length,
- (long)sizeof(size_t)+ size));
- #else
- first_word_occurance_buffer =
- (unsigned char *)s_malloc(MAX(word_occurance_block_length,
- sizeof(size_t)+ size));
- #endif
- *(unsigned char **)first_word_occurance_buffer = NULL; /* set the end */
- last_word_occurance_buffer = first_word_occurance_buffer;
- word_occurance_free_ptr = first_word_occurance_buffer + sizeof(size_t);
- }
- if((long)word_occurance_free_ptr + size >=
- word_occurance_block_length + (long)last_word_occurance_buffer){
- /* then allocate a new block */
- #ifdef WIN32
- unsigned char * new_block = (unsigned char *)s_malloc(MAX(word_occurance_block_length,
- (long)sizeof(size_t)+ size));
- #else
- unsigned char * new_block = (unsigned char *)s_malloc(MAX(word_occurance_block_length,
- sizeof(size_t)+ size));
- #endif
- *(unsigned char **)new_block = NULL; /* set the end of the chain */
- *(unsigned char **)last_word_occurance_buffer = new_block;
- word_occurance_free_ptr = new_block + sizeof(size_t);
- last_word_occurance_buffer = new_block;
- }
- /* allocate away */
- { unsigned char * answer = word_occurance_free_ptr;
- word_occurance_free_ptr += size;
- return(answer);
- }
- }
-
- void free_word_occurance_block(block)
- unsigned char *block;
- {
- /* this is not used with the new scheme, but is here in case
- malloc is a win on some systems */
- /* old way s_free(block); */
- }
-
- static void flush_word_occur_bufs_internal
- _AP((unsigned char* head_of_list));
-
- static void flush_word_occur_bufs_internal(head_of_list)
- unsigned char* head_of_list;
- /* frees all word occurance buffers. This should be done with care */
- {
- while(1){
- unsigned char * next_block;
- if(NULL == head_of_list)
- break;
- next_block = *(unsigned char **)head_of_list;
- s_free(head_of_list);
- head_of_list = next_block;
- }
- }
-
- void flush_word_occurance_buffers()
- {
- /* frees all word occurance buffers. This should be done with care */
- flush_word_occur_bufs_internal(first_word_occurance_buffer);
- first_word_occurance_buffer = NULL;
- word_occurance_free_ptr = NULL;
- last_word_occurance_buffer = NULL;
- }
-
-
- /* ---------------------------------------------------- */
- static hash_entry* look_up_word _AP((char* word,hashtable*
- the_word_memory_hashtable));
-
- static hash_entry*
- look_up_word(word,the_word_memory_hashtable)
- char* word;
- hashtable* the_word_memory_hashtable;
- {
- hash_entry * answer = get_hash(word, the_word_memory_hashtable);
-
- if(NULL != answer)
- return(answer);
- else{
- hash_entry wrd_entry;
- answer = put_hash(word, the_word_memory_hashtable, &wrd_entry);
- answer->number_of_occurances = 0;
- answer->memory_ptr =
- make_word_occurrance_block(WORD_MEMORY_INIT_BLOCK_SIZE);
- answer->current_memory_ptr = answer->memory_ptr;
- answer->memory_size = WORD_MEMORY_INIT_BLOCK_SIZE;
- answer->current_doc_id = 0;
- return(answer);
- }
- }
-
- static unsigned char add_weight _AP((long current_weight,long new_weight));
-
- static unsigned char
- add_weight(current_weight,new_weight)
- long current_weight;
- long new_weight;
- /* add a new weight to the existing one */
- {
- /* this should be smarter than this, like doing the log or something */
- if(127 < (current_weight + new_weight)){
- /* the max char. should be 255, but does not work on all compilers */
- return(127);
- }
- else{
- #ifdef WIN32
- return ((char)(current_weight + new_weight));
- #else
- return(current_weight + new_weight);
- #endif
- }
- }
-
- static unsigned char* more_memory _AP((unsigned char* current_memory_ptr,
- long current_memory_size,
- long new_size));
-
- static unsigned char* more_memory(current_memory_ptr,current_memory_size,new_size)
- unsigned char* current_memory_ptr;
- long current_memory_size;
- long new_size;
- /* Allocates more memory for a hash_entry. It transfers all the bytes
- * from the old to the new and then returns the new.
- */
- {
- unsigned char* new_memory = NULL;
- if(current_memory_size > new_size){
- panic("trying to contract a hash_entry block. This is not right\n");
- }
- new_memory = make_word_occurrance_block(new_size);
- if(NULL == new_memory){
- panic("Out of memory.");
- }
- memset(new_memory, 0, new_size);
- memmove(new_memory, current_memory_ptr, (size_t)current_memory_size);
- return(new_memory);
- }
-
- static long more_memory_size _AP((long current_size,
- long number_of_occurances));
-
- static long more_memory_size(current_size,number_of_occurances)
- long current_size;
- long number_of_occurances;
- /* This is pretty important to get right. This is a place holder */
- {
- return(MAX(2 * current_size, WORD_MEMORY_INIT_BLOCK_SIZE));
- }
-
- long write_bytes_to_memory(value,size,ptr)
- long value;
- long size;
- unsigned char* ptr;
- {
- /* writes the number into memory lsb first.
- returns the number of bytes written */
- long i;
- long original_value = value;
-
- if(size < 0) /* paranoia */
- panic("attempting to write a negative number of bytes");
-
- ptr += size; /* start at the end of the block and write backwards */
- for (i = 0; i < size; i++){
- ptr--;
- *ptr = (unsigned char)(value & 0xFF);
- value = value >> 8;
- }
- if(value != 0)
- panic("In a call to write_bytes_to_memory, the value %ld can not be represented in %ld bytes", original_value, size);
-
- return(size);
- }
-
- /* adds a word to the hashtable. Returns the 0 if successful.
- See irext.h for more documentation.
- */
- long add_word(word, char_pos, line_pos,
- weight, doc_id, date, word_pair, db, word_position)
- char *word; /* the word to be indexed, this could be a
- word pair. If NULL there are no more words
- to be indexed */
- long char_pos; /* the position of the start of the
- word */
- long line_pos; /* this is passed for the best
- section calculation */
- long weight; /* how important the word looks
- syntactically (such as is it bold)
- NOT used by signature system */
- long doc_id; /* current document, this will never be 0 */
- time_t date; /* display day of this document, 0 if not known */
- long word_pair;
- database* db; /* database to insert the document */
- boolean word_position; /* whether to have multiple entries for words in a document */
- {
- /* look up the word in the hashtable */
- /* creates it if necessary */
- hash_entry* wrd_entry;
- char* newword;
- t_Synonym* syn_Table = db->syn_Table;
- int syn_Table_Size = db->syn_Table_Size;
- hashtable * the_word_memory_hashtable = db->the_word_memory_hashtable;
- /* printf("Word: '%s' doc_id: %ld, pos: %ld, weight: %ld\n",
- word, doc_id, char_pos, weight); */
-
- if(NULL == db->the_word_memory_hashtable){
- panic("The memory word hashtable is not defined.");
- }
-
- /* if we have indexed enough words flush the memory copies to disk. */
- if(db->number_of_words_in_hashtable == db->flush_after_n_words)
- flush_memory_hashtable_to_disk(db, false);
-
- /* perform synonym table lookup and if the word is changed skip WordRoot */
- newword = lookup_Synonym( word,syn_Table,syn_Table_Size );
- strncpy( word,newword,MAX_WORD_LENGTH );
- wrd_entry = look_up_word(word, the_word_memory_hashtable);
- wrd_entry->number_of_occurances ++;
-
- if(wrd_entry->number_of_occurances > MAX_OCCURANCES){
- /* do nothing. we have enough of that word */
- }
- else{
- /* we have a word to add */
- db->number_of_words_in_hashtable ++;
-
- if(word_position || doc_id != wrd_entry->current_doc_id){
- /* then we have a new doc_id to add to the memory block */
-
- /* check to see if we need more memory */
- if((wrd_entry->memory_size -
- (wrd_entry->current_memory_ptr -
- wrd_entry->memory_ptr)
- <
- INDEX_ELEMENT_SIZE)){
- /* we need more memory. this makes more and frees the old*/
- unsigned char* old_memory_ptr = wrd_entry->memory_ptr;
-
- long new_size =
- more_memory_size(wrd_entry->memory_size,
- wrd_entry->number_of_occurances);
- /* cprintf(PRINT_AS_INDEXING, "Get more memory %ld bytes for %s\n", new_size, word); */
- wrd_entry->memory_ptr =
- more_memory(wrd_entry->memory_ptr, wrd_entry->memory_size,
- new_size);
- wrd_entry->current_memory_ptr =
- wrd_entry->memory_ptr + /* new offset */
- (wrd_entry->current_memory_ptr - old_memory_ptr);
- /* just being paranoid... no longer illegal
- if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr)
- panic("After allocating more memory, the size went to 0");
- */
- wrd_entry->memory_size = new_size;
- } /* finished making more memory */
-
- /* add away */
- wrd_entry->current_memory_ptr +=
- write_bytes_to_memory(doc_id, DOCUMENT_ID_SIZE,
- wrd_entry->current_memory_ptr);
- wrd_entry->current_memory_ptr +=
- write_bytes_to_memory(char_pos,
- CHARACTER_POSITION_SIZE,
- wrd_entry->current_memory_ptr);
- wrd_entry->current_memory_ptr +=
- write_bytes_to_memory(weight +
- /* add 5 since for the first one */
- (doc_id != wrd_entry->current_doc_id)?5:0,
- WEIGHT_SIZE,
- wrd_entry->current_memory_ptr);
- wrd_entry->current_doc_id = doc_id;
-
- }
- else{
- /* The word is already there,
- * just increment the weight in the record.
- * This will change when/if position information is kept (for proximity).
- */
- if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr){
- panic("Memory hashtable error. Recorded doc_id %ld, current doc_id %ld\n",
- wrd_entry->current_doc_id, doc_id);
- }
- *(wrd_entry->current_memory_ptr - 1) =
- add_weight(*(wrd_entry->current_memory_ptr - 1), weight);
- }
- }
- return(0L);
- }
-
- void add_stop_words(the_word_memory_hashtable)
- hashtable *the_word_memory_hashtable;
- /* add the stop words to the hashtable. this must be done before
- adding other words */
- {
- init_stop_list();
- while(true){
- char *word = next_stop_word();
- hash_entry* wrd_entry;
-
- if(NULL == word)
- break;
- wrd_entry = look_up_word(word, the_word_memory_hashtable);
- wrd_entry->number_of_occurances = STOP_WORD_FLAG;
- }
- }
-
-